package cn.doitedu.dmp.crawler;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;

/**
 * Extracts the pagination links found on each category page.
 *
 * <p>Reads {@code crawler/data/appcatelog.txt} line by line. Each line is split on
 * {@code ","}: the first field is fetched as a URL with Jsoup, the second field is
 * copied to the output unchanged. For every {@code <a>} tag inside the first element
 * with class {@code pagebars} of the fetched page, one line of the form
 * {@code href,secondField} is written to {@code crawler/data/applist_addr.txt}.
 */
public class AzappListAddrExtract {

    /**
     * Runs the extraction over the whole input file.
     *
     * <p>Lines with fewer than two comma-separated fields, and pages that contain no
     * {@code pagebars} element, are reported on {@code System.err} and skipped instead
     * of aborting the run with an index-out-of-bounds exception.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read, the output file cannot be
     *                     written, or a page cannot be fetched
     */
    public static void main(String[] args) throws IOException {

        // try-with-resources closes (and therefore flushes) both files even when a
        // fetch or parse step throws part-way through the input.
        try (BufferedReader br = new BufferedReader(new FileReader("crawler/data/appcatelog.txt"));
             BufferedWriter bw = new BufferedWriter(new FileWriter("crawler/data/applist_addr.txt"))) {

            String line;
            // Reading stops at end of file or at the first blank line.
            while (StringUtils.isNotBlank(line = br.readLine())) {

                String[] split = line.split(",");
                // Validate before hitting the network: split[1] is needed for every output row.
                if (split.length < 2) {
                    System.err.println("Skipping malformed line (expected two comma-separated fields): " + line);
                    continue;
                }

                Connection connect = Jsoup.connect(split[0]);
                Document doc = connect.get();

                Elements pageBars = doc.getElementsByClass("pagebars");
                if (pageBars.isEmpty()) {
                    System.err.println("No element with class 'pagebars' found, skipping: " + split[0]);
                    continue;
                }

                // Only the first pagebars element is used.
                Elements aTags = pageBars.get(0).getElementsByTag("a");
                for (Element aTag : aTags) {
                    String href = aTag.attr("href");
                    bw.write(href + "," + split[1]);
                    bw.newLine();
                }
            }
        }
    }


}
